from elpisciDSFuncs.scrnaseq import *
from nbdev.showdoc import show_doc
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook shell escape that lists the working directory. It is kept commented
# out because `!cmd` is IPython-only syntax and is not valid in a Python file.
# !ls
# Load the AnnData object from the h5ad file (the same path is written by
# `adata.write(...)` further down in this file).
adata = sc.read('pmid33837218-GSE160048.h5ad')
adata
# Every gene name in the raw matrix that contains the substring 'CLEC'.
CLEC = [gene_name for gene_name in adata.raw.var_names if 'CLEC' in gene_name]
# Matplotlib axis styling: no y-margin, left/bottom spines on, right/top
# spines off.
axis_style = {
    'axes.ymargin': 0,
    'axes.spines.left': True,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': True,
}
for rc_key, rc_value in axis_style.items():
    plt.rcParams[rc_key] = rc_value
# Scanpy figure defaults for the plots that follow.
sc.settings.set_figure_params(
    dpi=100,
    facecolor='white',
    fontsize=10,
    figsize=(5, 5),
)
# Dot plot of the collected CLEC genes, grouped by 'Cell_subtype' and read
# from `adata.raw`.
clec_dotplot = sc.pl.dotplot(
    adata,
    {'CLECs': CLEC},
    groupby='Cell_subtype',
    use_raw=True,
    mean_only_expressed=True,
    colorbar_title='Log2RPMK',
    return_fig=True,  # return the plot object so add_totals() can be chained
    # Tuning knobs left at their defaults: title, dot_max, dot_min, vmin, vmax.
)
clec_dotplot.add_totals(color='black', size=0.5).show()
# Keyword arguments common to the three dot plots below.
shared_dotplot_kwargs = dict(
    groupby='Cell_subtype',
    mean_only_expressed=True,
    colorbar_title='Log2RPMK',
    use_raw=True,
)
# IL6 alone; dot_max / dot_min / vmin / vmax stay at their defaults.
sc.pl.dotplot(adata, 'IL6', **shared_dotplot_kwargs)
# IL6 next to MKI67, with explicit dot-size limits.
sc.pl.dotplot(
    adata,
    ['IL6', 'MKI67'],
    dot_max=0.15,
    dot_min=0,
    **shared_dotplot_kwargs,
)
# TNF, TNFRSF1A, TNFRSF1B and FCAR.
sc.pl.dotplot(
    adata,
    ['TNF', 'TNFRSF1A', 'TNFRSF1B', 'FCAR'],
    **shared_dotplot_kwargs,
)
# Re-apply the scanpy figure defaults, then plot 'n_genes_by_counts' per
# 'Cell_subtype'.
sc.settings.set_figure_params(
    dpi=100,
    facecolor='white',
    fontsize=10,
    figsize=(5, 5),
)
n_genes_dotplot = sc.pl.dotplot(
    adata,
    'n_genes_by_counts',
    groupby='Cell_subtype',
    mean_only_expressed=True,
    return_fig=True,  # return the plot object so add_totals() can be chained
    # Tuning knobs left at their defaults: title, dot_max, dot_min, vmin, vmax.
)
n_genes_dotplot.add_totals(color='black', size=0.5).show()
# Notebook shell escape used to peek at the raw file:
# !head -5 GSE160048_human_glom_single_cell_rpkms.txt
# Load the tab-separated table; index_col=False keeps the first column as a
# regular data column (it shows up as 'Unnamed: 0').
exp_matrix = pd.read_csv(
    'GSE160048_human_glom_single_cell_rpkms.txt',
    index_col=False,
    nrows=None,
    sep='\t',
)
len(exp_matrix.keys())
# 767
exp_matrix.shape
# (16988, 767)
# The identifier column is '|'-separated; its first field is stored as SYMBOL.
exp_matrix['SYMBOL'] = exp_matrix['Unnamed: 0'].apply(lambda x: x.split('|')[0])
# Count distinct symbols. The original line referenced an undefined name
# `test`; the table being inspected here is `exp_matrix`.
len(np.unique(exp_matrix['SYMBOL']))
# 16939
# Columns to keep: 'SYMBOL' first, followed by every column whose name
# contains an underscore (presumably the per-cell columns; confirm against
# the input file header).
col = ['SYMBOL'] + [column_name for column_name in exp_matrix.keys() if '_' in column_name]
len(col)
# 767
# NOTE: this is where the rows whose SYMBOL starts with 'ERCC_' are removed,
# so that a simple normalization method can be used afterwards.
exp_matrix = exp_matrix[exp_matrix['SYMBOL'].apply(lambda x: x[:5] != 'ERCC_')]
exp_matrix.shape
# (16988, 767)
# (16927, 768)
# Keep only the selected columns, index the rows by SYMBOL and transpose, so
# that each former column becomes a row and each symbol becomes a column.
exp_matrix = exp_matrix[col].set_index('SYMBOL').T
# Write to file.
# The row index has to be written as well: after the transpose it carries the
# row labels (the former column names), and `index=None` silently dropped
# them, leaving the file without a label column. `index_label` gives that
# first column a header so the header row stays aligned with the data rows.
exp_matrix.to_csv(
    'GSE160048_human_glom_single_cell_rpkms.format.tsv',
    sep='\t',
    index=True,
    index_label='SYMBOL',
)
exp_matrix.describe()
# Read the formatted, tab-separated expression table into an AnnData object.
# `first_column_names` is a boolean flag (treat the first column as row
# names); the original passed the string 'SYMBOL', which only acted as a
# truthy value.
# NOTE(review): confirm that the first column of the TSV really holds row
# labels rather than expression values.
adata = sc.read_csv(
    'GSE160048_human_glom_single_cell_rpkms.format.tsv',
    delimiter='\t',
    first_column_names=True,
    dtype='float32',
)
adata
# De-duplicate variable and observation names in place.
adata.var_names_make_unique()
adata.obs_names_make_unique()
adata
# The usual cell/gene filters are left disabled; author's note: no further
# data can be lost, so no additional filtering is applied.
# sc.pl.highest_expr_genes(adata, n_top=20, )
# sc.pp.filter_cells(adata, min_genes=200)
# sc.pp.filter_genes(adata, min_cells=3)
# Add the QC columns ('n_genes_by_counts', 'total_counts', ...) in place.
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts'],
    jitter=0.4,
    multi_panel=True,
)
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
# Keep observations with fewer than 7000 detected genes. `.copy()` turns the
# sliced view into an independent object, so the in-place preprocessing that
# follows does not operate on a view of the unfiltered data.
adata = adata[adata.obs.n_genes_by_counts < 7000, :].copy()
# Total-count normalization (scaling every cell to 10,000 counts) is
# commented out, so it is NOT applied here; only the log transform below runs.
# sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata, base=2)
# Flag highly variable genes and plot the selection.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
# Keep the full log-transformed matrix in `.raw` before subsetting, so plots
# called with use_raw=True can still reach every gene.
adata.raw = adata
# `.copy()` materializes the highly-variable subset; `sc.pp.scale` modifies
# the matrix in place and should not be handed a view.
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.scale(adata, max_value=10)
# PCA with the ARPACK solver, followed by its two diagnostic plots.
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata)
sc.pl.pca_variance_ratio(adata, log=True)
# Neighbour graph, then UMAP and Leiden on top of it (both with defaults).
neighbor_settings = {'n_neighbors': 10, 'n_pcs': 40}
sc.pp.neighbors(adata, **neighbor_settings)
for graph_step in (sc.tl.umap, sc.tl.leiden):
    graph_step(adata)
adata
# Matplotlib axis styling: no y-margin, left/bottom spines on, right/top
# spines off.
axis_style = {
    'axes.ymargin': 0,
    'axes.spines.left': True,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': True,
}
for rc_key, rc_value in axis_style.items():
    plt.rcParams[rc_key] = rc_value
# Print every column name of the expression table that contains 'CD4'.
for cd4_like in (name for name in exp_matrix.keys() if 'CD4' in name):
    print(cd4_like)
sc.settings.set_figure_params(dpi=60, facecolor='white')
# UMAP coloured by QC depth, podocyte markers and the Leiden clusters.
# 'NPHS2' appeared twice in the original list, which only drew the same panel
# twice; it is listed once here.
sc.pl.umap(
    adata,
    color=[
        'n_genes_by_counts',
        'NPHS2', 'PODXL', 'NPHS1', 'IGFBP7',  # podocyte
        'leiden',
    ],
    size=80,
    ncols=5,
    use_raw=True,
)
# cell_type_anno = {}
# cell_type_anno['Podo'] = ['7']
# Number of cells per Leiden cluster.
adata.obs['leiden'].value_counts()
# UMAP coloured by QC depth, PECAM1, SPP1 and the Leiden clusters, with the
# cluster labels drawn on the embedding. Point size stays at the default.
pecam1_spp1_panels = ['n_genes_by_counts', 'PECAM1', 'SPP1', 'leiden']
sc.pl.umap(
    adata,
    color=pecam1_spp1_panels,
    ncols=5,
    use_raw=True,
    legend_loc='on data',
    palette='Set3',
)
###
# cell_type_anno['Tubules'] = ['3']
# Manual annotation: cell-type label -> list of Leiden cluster ids.
cell_type_anno = {
    'Podo': ['7'],
    'Tubules': ['3'],
    'T&NK': ['5'],
    'Mφ&Μono': ['1', '9', '2'],
    'MLC': ['4'],  # PDGFRB
    'GEC': ['0', '8', '6'],
}
# Abbreviations:
#   DCT:  distal convoluted tubule
#   GEC:  glomerular endothelial cells
#   MLC:  mesangial-like cell
#   MNP:  mononuclear phagocyte
#   PTC:  proximal tubule cell
#   T+NK: T cell and NK cells
#   cTAL: cortical thick ascending limb
#   CD:   collecting duct
# Invert the annotation into Leiden cluster id -> cell-type label.
leiden_cell_type = {
    cluster_id: cell_type
    for cell_type, cluster_ids in cell_type_anno.items()
    for cluster_id in cluster_ids
}
# Label every cell through its cluster id; a cluster missing from the mapping
# raises KeyError.
adata.obs['Cell_subtype'] = adata.obs.leiden.apply(
    lambda cluster_id: leiden_cell_type[cluster_id]
)
adata.obs['Cell_subtype'].value_counts()
# X_tsne_list = []
# for i,j in adata.obs.iterrows():
# ### keep all of the original positions
# X_tsne_list.append([j.Global_tSNE_1,j.Global_tSNE_2,j.Sub_tSNE_1,j.Sub_tSNE_2])
# # X_tsne_list.append([j.Sub_tSNE_1,j.Sub_tSNE_2])
# replace_X_tsne = np.array(X_tsne_list)
# type(replace_X_tsne)
# adata.obsm['X_tsne']=replace_X_tsne
# adata.obs['leiden'].value_counts()
# help(sc.pl.umap)
# IMPORTANT: pay close attention to how `raw` is written/used here, to avoid
# losing gene-expression data.
sc.pl.umap(
    adata,
    color=[
        'NPHS2',  # podocyte
        'PTPRC',  # CD45
        'CD3D', 'CD8A',  # T cell
        # 'leiden'
        'Cell_subtype',
    ],
    size=100,
    ncols=5,
    use_raw=True,
    palette='Set3',
    legend_loc='on data',
)
#######
# cell_type_anno['T&NK'] = ['5']
# Two UMAP figures with the same layout options: first TNF / TGFB1 / IL6
# next to the cell-subtype labels, then the cell-subtype labels alone.
subtype_umap_kwargs = dict(
    ncols=5,
    use_raw=True,
    palette='Set3',
    legend_loc='on data',
)
for panel_keys in (
    ['TNF', 'TGFB1', 'IL6', 'Cell_subtype'],
    ['Cell_subtype'],
):
    sc.pl.umap(adata, color=panel_keys, **subtype_umap_kwargs)
# Larger font for the cluster overview figure.
sc.settings.set_figure_params(
    dpi=60,
    facecolor='white',
    fontsize=16,
    figsize=(5, 5),
)
# Leiden clusters with their labels drawn on the embedding.
sc.pl.umap(
    adata,
    color=['leiden'],
    ncols=4,
    size=100,
    legend_loc='on data',
    palette='Set3',
)
# cell_type_anno['Mφ&Μono'] = ['1','9','2']
# cell_type_anno['MLC'] = ['4'] #PDGFRB
# Marker panel for the UMAP below; the inline notes are the author's.
myeloid_marker_panels = [
    'n_genes_by_counts',
    'NPHS2',  # podocyte
    'CD3D', 'PTPRC',  # T cell
    'TNF',
    'FCN1', 'CD14',
    'FCGR3A',  # CD16
    'FCGR2A', 'FCGR2B',
    'FCAR',  # FCAR CD89 (IgA1,2 receptor)
    'MRC1',  # CD206
    'CD86', 'CD163',
    'C1QA',  # 'C1QB','C1QC',
    'MERTK',
    'CSF1',
    'SPP1',
    'PDGFRB',  # MLC
    'FHL2',  # MC, mesangial cell;
]
# Point size stays at the default.
sc.pl.umap(adata, color=myeloid_marker_panels, ncols=5, use_raw=True)
# cell_type_anno['GEC'] = ['0','8','6']
# UMAP coloured by QC depth, PECAM1 and the Leiden clusters, with cluster
# labels drawn on the embedding. Point size stays at the default.
pecam1_panels = ['n_genes_by_counts', 'PECAM1', 'leiden']
sc.pl.umap(
    adata,
    color=pecam1_panels,
    ncols=5,
    use_raw=True,
    legend_loc='on data',
    palette='Set3',
)
sc.settings.set_figure_params(dpi=60, facecolor='white', fontsize=16, figsize=(5, 5))
# Overview UMAP across the marker genes used for annotation. 'PDGFRB' was
# listed twice in the original, which only drew the same panel twice; it is
# listed once here. Point size stays at the default.
sc.pl.umap(
    adata,
    color=[
        'n_genes_by_counts',
        'NPHS2',  # podocyte
        'CD3D', 'PTPRC',  # T cell
        'TNF',
        'FCGR2A', 'FCGR2B',
        'FCAR',  # FCAR CD89 (IgA1,2 receptor)
        'CD86', 'CD163',
        'PECAM1',
        'SPP1',
        'PDGFRB',
        'FHL2',  # MC, mesangial cell;
        'NT5E',
        'CLDN4', 'AQP3',  # PC, principal cells;
        'CUBN', 'SLC12A3',
    ],
    ncols=5,
    use_raw=True,
)
# Persist the AnnData object, including the 'Cell_subtype' annotation, to disk.
annotated_h5ad_path = 'pmid33837218-GSE160048.h5ad'
adata.write(annotated_h5ad_path)
sc.settings.set_figure_params(
    dpi=100,
    facecolor='white',
    fontsize=10,
    figsize=(5, 5),
)
# Dot plot of IL6, TNF and TGFB1 per 'Cell_subtype', read from `adata.raw`.
# ('Trem2' and 'Adgre1' were left disabled by the author.)
cytokine_genes = {'genes': ['IL6', 'TNF', 'TGFB1']}
cytokine_dotplot = sc.pl.dotplot(
    adata,
    cytokine_genes,
    groupby='Cell_subtype',
    use_raw=True,
    mean_only_expressed=True,
    colorbar_title='Log2RPMK',
    return_fig=True,  # return the plot object so add_totals() can be chained
    # Tuning knobs left at their defaults: title, dot_max, dot_min, vmin, vmax.
)
cytokine_dotplot.add_totals(color='black', size=0.5).show()
# Keyword arguments common to the three dot plots below.
shared_dotplot_kwargs = dict(
    groupby='Cell_subtype',
    mean_only_expressed=True,
    colorbar_title='Log2RPMK',
    use_raw=True,
)
# IL6 alone; dot_max / dot_min / vmin / vmax stay at their defaults.
sc.pl.dotplot(adata, 'IL6', **shared_dotplot_kwargs)
# IL6 next to MKI67, with explicit dot-size limits.
sc.pl.dotplot(
    adata,
    ['IL6', 'MKI67'],
    dot_max=0.15,
    dot_min=0,
    **shared_dotplot_kwargs,
)
# TNF, TNFRSF1A, TNFRSF1B and FCAR.
sc.pl.dotplot(
    adata,
    ['TNF', 'TNFRSF1A', 'TNFRSF1B', 'FCAR'],
    **shared_dotplot_kwargs,
)
# Re-apply the scanpy figure defaults, then plot 'n_genes_by_counts' per
# 'Cell_subtype'.
sc.settings.set_figure_params(
    dpi=100,
    facecolor='white',
    fontsize=10,
    figsize=(5, 5),
)
n_genes_dotplot = sc.pl.dotplot(
    adata,
    'n_genes_by_counts',
    groupby='Cell_subtype',
    mean_only_expressed=True,
    return_fig=True,  # return the plot object so add_totals() can be chained
    # Tuning knobs left at their defaults: title, dot_max, dot_min, vmin, vmax.
)
n_genes_dotplot.add_totals(color='black', size=0.5).show()
# Notebook help lookup, kept for reference:
# ?sc.pl.dotplot
# MKI67 per 'Cell_subtype', read from `adata.raw`, with explicit dot-size
# limits.
mki67_dotplot_kwargs = dict(
    groupby='Cell_subtype',
    mean_only_expressed=True,
    colorbar_title='Log2RPMK',
    use_raw=True,
    dot_max=0.15,
    dot_min=0,
)
sc.pl.dotplot(adata, 'MKI67', **mki67_dotplot_kwargs)